#### R Code for Manuscript "Carnivorous Nepenthes pitchers with less acidic fluid house nitrogen-fixing bacteria" 
### Code written by authors Wolock and Bittleston

# Set global options --------------------------------------------------------------------
## global options

options(
      
    stringsAsFactors = FALSE,
    
    width = 180,
  
    scipen = 7
)


# install and load packages ------------------------------------------------
## function to install and load packages
ipak <- function (pkg) {
    
    # attach only packages that have not already been attached
    new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]    
    if (length(new.pkg)) 
        install.packages(new.pkg, dependencies = TRUE)
    sapply(pkg, require, character.only = TRUE)
    
}

# list of required packages
packages <- c("effects", "nlme", "lme4", "MASS", "ggplot2", "caret", "car", 
              "scales", "Rcpp","qiime2R","vegan","gridExtra","gtable","grid","RColorBrewer","reshape")

# install packages
ipak(packages)

# format scientific notation --------------------------------------------------
## function to format scientific notation 
scientific_10 <- function(x) {
  
    parse(text=gsub("e", " %*% 10^", scientific_format()(x)))
    
}

# --------------------------------------------------------------------------------------
## import data ####

# set working directory
directory <- "~/Dropbox/Leonora/Current_Manuscripts/Nif_paper" ### Change to your working directory

setwd(directory)

# load data for all samples
alldata <- read.csv("nifH_data_all_updated.csv")

# load proportions data
propdata <- read.csv("PCR_proportions.csv")

# load metagenome data
mgdata <- read.csv("metagenome_nifH_data.csv")

# load ASV data
asv16s <- read.csv("NifH_project_16S_ASV_table_all.csv", check.names = F, row.names = 1)

# load taxonomy data for ASVs
asv16s.tax <- read.csv("NifH_project_16S_ASV_taxonomy_all.csv")

# --------------------------------------------------------------------------------------
## convert variables and set reference level for models ####

# convert some variables to factors
facvars <- c("PCRSuccess", "Year","Site", "HostSpecies")

alldata[, facvars] <- lapply(alldata[, facvars], factor)


propdata$HostSpecies <- as.factor(propdata$HostSpecies)

mgdata$Species <- as.factor(mgdata$Species)

# relevel to set N. gracilis as reference level for first 4 models
alldata$HostSpecies <- ordered(alldata$HostSpecies, levels =c("N. gracilis","N. rafflesiana", "N. ampullaria"))

propdata$HostSpecies <- ordered(propdata$HostSpecies, levels =c("N. gracilis","N. rafflesiana", "N. ampullaria"))

# relevel to set N. stenophylla as reference level for last model
mgdata$Species <- ordered(mgdata$Species, levels =c("steno","veit","reint","hirs"))

# Models --------------------------------------------------------------------------------

##### Predicted metagenome data ####
### Linear mixed effects model for both years of data, Host species and DNA concentration
lme1 <- lme(sqrt(nifHAbundance) ~ HostSpecies + DNAConc, random =list(~1|Year, ~1|Site), data = alldata, na.action=na.omit)
summary(lme1)
anova(lme1, type = "marginal")

plot(lme1)
qqnorm(resid(lme1))
qqline(resid(lme1))

# plot effects
plot(allEffects(lme1))

# vif evaluation for collinearity
vif(lme1)


## Linear mixed effects model with 2013 data, includes pH and volume
lme2 <- lme(sqrt(nifHAbundance) ~ HostSpecies + pH + VolumeTotal + DNAConc, random = ~1|Site, data = alldata, na.action=na.omit)

summary(lme2)
anova(lme2, type = "marginal")

# diagnostic plots
plot(lme2)
qqnorm(resid(lme2))
qqline(resid(lme2))

# plot effects
plot(allEffects(lme2))

# vif evaluation for collinearity
vif(lme2)

# PCR data -------------------------------------------------------------------------

## generalized linear mixed models for nifH presence/absence

## GLMM for nifH presence/absence, all samples

glmer1 <- glmer(PCRSuccess ~ HostSpecies + DNAConc + (1|Year) + (1|Site), family = 
              binomial(link = "logit"), data = alldata)

summary(glmer1)
anova(glmer1)

# plots effects
plot(allEffects(glmer1))

# diagnostic plots
plot(glmer1)

# odds ratios
exp(fixef(glmer1))
confint.merMod(glmer1, method = "Wald")

# vif evaluation for collinearity
vif(glmer1)

## GLM for nifH presence/absence, only samples with pH measurements

glmer2 <- glmer(PCRSuccess ~ HostSpecies + pH + DNAConc + VolumeTotal + (1|Site), family = 
                  binomial(link = "logit"), data = alldata)

summary(glmer2)
anova(glmer2)

# plot effects
plot(allEffects(glmer2), rescale.axis=F, ylab="Log Odds of nifH Presence")

# diagnostic plots
plot(glmer2)

# odds ratios
exp(fixef(glmer2))
confint.merMod(glmer2, method = "Wald")

# vif evaluation for collinearity
vif(glmer2)

# Metagenomic data ---------------------------------------------------------------------

## Linear model for nifH normalized abundance in metagenomic data

lm1 <- lm(NifH_norm_aa90 ~ pH + Species, data = mgdata)

summary(lm1)
anova(lm1)

# diagnostic plots
plot(lm1)

# plot effects
plot(allEffects(lm1))

# vif evaluation for collinearity
vif(lm1)

#---------------------------------------------------------------------------------

## Statistical tests ####

# pairwise host species lists for all possible comparisons
pairwise.list <- list(subset(alldata, HostSpecies=="N. ampullaria" | HostSpecies=="N. gracilis",,),
    subset(alldata, HostSpecies=="N. ampullaria" | HostSpecies=="N. rafflesiana",,),
    subset(alldata, HostSpecies=="N. rafflesiana" | HostSpecies=="N. gracilis",,))

# compare nifH abundance by host species using Mann-Whitney U tests
p.vals1 <- lapply(pairwise.list, 
    function(x) wilcox.test(nifHAbundance ~ HostSpecies, data = x)$p.value)

stat.vals1 <- lapply(pairwise.list, 
                  function(x) wilcox.test(nifHAbundance ~ HostSpecies, data = x)$statistic)

# compare pH by host species using Mann-Whitney U tests
p.vals2 <- lapply(pairwise.list,
    function(x) wilcox.test(pH ~ HostSpecies, data = x)$p.value)
stat.vals2 <- lapply(pairwise.list, 
                     function(x) wilcox.test(pH ~ HostSpecies, data = x)$statistic)

# make table of PCR results, separated by species
tbl = table(alldata$HostSpecies, alldata$PCRSuccess)

# chi square proportion test for pcr success, ampullaria vs gracilis
p.val3 <- prop.test(x = c(tbl[2,"y"], tbl[1,"y"]), n = c(tbl[2,"y"]+tbl[2,"n"], 
    tbl[1,"y"]+tbl[1,"n"]), alternative="two.sided", correct = FALSE)$p.value

stat.val3 <- prop.test(x = c(tbl[2,"y"], tbl[1,"y"]), n = c(tbl[2,"y"]+tbl[2,"n"], 
                                                         tbl[1,"y"]+tbl[1,"n"]), alternative="two.sided", correct = FALSE)$statistic

# chi square proportion test for pcr success, ampullaria vs rafflesiana
p.val4 <- prop.test(x = c(tbl[2,"y"], tbl[3,"y"]), n = c(tbl[2,"y"]+tbl[2,"n"], 
    tbl[3,"y"]+tbl[3,"n"]), alternative="two.sided", correct = FALSE)$p.value

stat.val4 <- prop.test(x = c(tbl[2,"y"], tbl[3,"y"]), n = c(tbl[2,"y"]+tbl[2,"n"], 
                                                         tbl[3,"y"]+tbl[3,"n"]), alternative="two.sided", correct = FALSE)$statistic
# chi square proportion test for pcr success, rafflesiana vs gracilis
p.val5 <- prop.test(x = c(tbl[3,"y"], tbl[1,"y"]), n = c(tbl[3,"y"]+tbl[3,"n"], 
    tbl[1,"y"]+tbl[1,"n"]), alternative="two.sided", correct = FALSE)$p.value

stat.val5 <- prop.test(x = c(tbl[3,"y"], tbl[1,"y"]), n = c(tbl[3,"y"]+tbl[3,"n"],
                                                         tbl[1,"y"]+tbl[1,"n"]), alternative="two.sided", correct = FALSE)$statistic
# bonferroni correction on all p-values
p.vals <- p.adjust(c(p.vals1, p.vals2, p.val3, p.val4, p.val5))
stat.vals <- c(stat.vals1,stat.vals2,stat.val3,stat.val4,stat.val5)

#---------------------------------------------------------------------------------------
## Figures for non-ASV data ####

# Singaporean host species labels
species_labels <- c(expression(paste(italic("N. gracilis"))), 
                    expression(paste(italic("N. rafflesiana"))), 
                    expression(paste(italic("N. ampullaria")))) 

# Bornean host species labels
species_labels_b <- c(expression(paste(italic("N. stenophylla"))), 
                      expression(paste(italic("N. veitchii"))), 
                      expression(paste(italic("N. reintwardtiana"))),
                      expression(paste(italic("N. hirsuta")))) 

# y-axis labels
label1 <- c(expression(paste(italic("nifH"), " relative abundance (sqrt)"))) 

label2 <- c(expression(paste("Count of samples with ", italic("nifH"))))

label3 <- c(expression(paste("Proportion of samples with ", italic("nifH"))))

label4 <- c(expression(paste("Normalized ", italic("nifH"), " rel. abundance")))


## Colors
Hsp <- c("#ef8a62","#5ab4ac","#998ec3")
Bsp <- c("#e69f00","#56b4e9","#cc79a7","#332288")

### Figure 1 left column - Host species: pH, rel abund predicted nifH, proportion PCR nifH
f1p1 <- ggplot(data=alldata, aes(x=HostSpecies, y=pH, fill=HostSpecies)) +
  geom_boxplot(outlier.shape = NA) + geom_jitter(width = 0.2) + scale_fill_manual(values=Hsp) + 
  theme_classic() + theme(legend.position="none",axis.title.x=element_blank(),axis.text.x=element_blank())
f1p2 <- ggplot(data=alldata, aes(x=HostSpecies, y=sqrt(nifHAbundance),fill=HostSpecies)) +
  geom_boxplot(outlier.shape = NA) + geom_jitter(width = 0.2) + ylab(label1) + scale_fill_manual(values=Hsp) + 
  theme_classic() + theme(legend.position="none",axis.title.x=element_blank(),axis.text.x=element_blank())
f1p3 <- ggplot(data=propdata, aes(x=HostSpecies, y=Proportion, fill=HostSpecies)) + 
  geom_bar(stat="identity") + ylab(label3) + xlab("Host Species") + scale_fill_manual(values=Hsp) + 
  geom_text(aes(label = Together), vjust = -.5) + ylim(0,.9) +
  theme_classic() + theme(legend.position="none",axis.text.x=element_text(face="italic"))

#grid.arrange(f1p1,f1p2,f1p3,ncol=1)

f1g1 <- ggplotGrob(f1p1)
f1g2 <- ggplotGrob(f1p2)
f1g3 <- ggplotGrob(f1p3)
f1g <- rbind(f1g1, f1g2, f1g3, size = "first")
f1g$widths <- unit.pmax(f1g1$widths, f1g2$widths, f1g3$widths)

### Figure 1, right column: metagenome samples with their pH levels
f1p4 <- ggplot(data=mgdata, aes(x=Species, y=pH, colour=Species, shape=Species)) +
  geom_point(size=3) + scale_colour_manual(values=Bsp) + theme_classic() +
  theme(legend.position="none",axis.title.x=element_blank(),axis.text.x=element_blank())
f1p5 <- ggplot(data=mgdata, aes(x=Species, y=NifH_norm_aa90, colour=Species, shape=Species)) +
  geom_point(size=3) + scale_colour_manual(values=Bsp) + ylab(label4) + xlab("Host Species") + scale_x_discrete(labels= species_labels_b) +
  theme_classic() + theme(legend.position="none") 
f1g4 <- ggplotGrob(f1p4)
f1g5 <- ggplotGrob(f1p5)
f1gr <- rbind(f1g4, f1g5, size = "first")
f1gr$widths <- unit.pmax(f1g4$widths, f1g5$widths)

grid.arrange(f1g,f1gr,ncol=2)

### Figure 2: data plots and effect plots for pH 
f2p1 <- ggplot(data=alldata, aes(x=pH, y=sqrt(nifHAbundance))) +
  geom_point(alpha = 0.6)  +
  ylab(label1) +
  theme_classic() + theme(legend.position="none")
f2p1

f2p2 <- plot(effect("pH", lme2), rescale.axis=F, ylab=label1, main = F)
f2p2

alldataPCR <- subset(alldata, !is.na(PCRSuccess))
GrBlk <- c("#D3D3D3","#000000")
f2p3 <- ggplot(data=alldataPCR, aes(x=pH)) +
  geom_histogram(stat="bin",aes(fill=PCRSuccess),binwidth = 1,position='dodge')+
  theme_classic() + scale_fill_manual(values=GrBlk) + ylab(label2) + 
  theme(legend.title=element_blank(), legend.position = c(0.9, 0.8)) +
  scale_x_continuous(limits = c(0,7), breaks = c(0,1,2,3,4,5,6,7))
f2p3

f2p4 <- plot(effect("pH", glmer2), ylab=label3, rescale.axis=F, main = F)
f2p4

f2p5 <- ggplot(data=mgdata, aes(x=pH, y=NifH_norm_aa90)) +
  geom_point(alpha=0.6) +
  ylab(label4) +
  theme_classic() + theme(legend.position="none")
f2p5

f2p6 <- plot(effect("pH", lm1), ylab=label4, rescale.axis=F, main = F)
f2p6

grid.arrange(f2p1,f2p3,f2p5,f2p2,f2p4,f2p6,ncol=3)


# ASV data --------------------------------------------------------------------------

### Separate taxonomic categories into columns
asv16s.tax2 <- parse_taxonomy(asv16s.tax[,1:2])

### Remove ASVs matching to mitochondria in the family level taxonomy
asv16s.tax2 <- asv16s.tax2[grep("Mitochondria",asv16s.tax2$Family, invert = T),]

### Also remove the mitochondria ASVs from ASV table
asv16s <- subset(asv16s, row.names(asv16s) %in% row.names(asv16s.tax2))

# Remove rows with NAs under the "nifHAbundance" column for alldata
alldata2 <- alldata[complete.cases(alldata$nifHAbundance), ]

## Subset ASV table to match metadata ####
asv16s.nifH <- asv16s[, intersect(colnames(asv16s), alldata2$Sample)]

# Subset alldata2 to match new asv16s.nifH
alldata3 <- subset(alldata2, alldata2$Sample %in% colnames(asv16s.nifH))

# Find column names in alldata2$Sample that are not present in colnames(asv16s.nifH)
missingSamples <- setdiff(alldata2$Sample, colnames(asv16s.nifH))

# View the missing sample names
missingSamples

# Remove ASVs no longer present and summarize
asv16s.nifH <- asv16s.nifH[rowSums(asv16s.nifH) > 1,] ## remove ASVs no longer present
summary(rowSums(asv16s.nifH))
summary(colSums(asv16s.nifH)) 


### Rarefaction ####
summary(rowSums(asv16s.nifH))
summary(colSums(asv16s.nifH))

asv16s.nifH.r <- rrarefy(t(asv16s.nifH),7749)
asv16s.nifH.r <- asv16s.nifH.r[,colSums(asv16s.nifH.r) > 0] ## remove ASVs no longer present
summary(rowSums(asv16s.nifH.r))
summary(colSums(asv16s.nifH.r))

# Make sure ASVs match
asv16s.nifH.tax <- subset(asv16s.tax2, row.names(asv16s.tax2) %in% colnames(asv16s.nifH.r))


##### Family level barplot ####
asv16s.nifHtf <- data.frame(Family=asv16s.nifH.tax$Family,t(asv16s.nifH.r), check.names = F)
asv16s.nifHtf$Family[is.na(asv16s.nifHtf$Family)] <- "Unknown"
asv16s.nifHtfa <- aggregate(. ~ asv16s.nifHtf$Family, asv16s.nifHtf[,2:ncol(asv16s.nifHtf)], sum) 
row.names(asv16s.nifHtfa) <- asv16s.nifHtfa[,1]
asv16s.nifHtfa <- asv16s.nifHtfa[,2:ncol(asv16s.nifHtfa)]

asv16s.nifHtfao <- as.matrix(asv16s.nifHtfa[order(rowSums(asv16s.nifHtfa),decreasing = T),])

asv16s.nifHtfaot <- asv16s.nifHtfao[1:12,]
other2 <- colSums(asv16s.nifHtfao[13:nrow(asv16s.nifHtfao),])
asv16s.nifHtfaot2 <- rbind(asv16s.nifHtfaot,other2)

### Sort by nifH abundance in alldata3
alldata3$Sample == colnames(asv16s.nifHtfaot2)
asv16s.nifHtfaot2 <- asv16s.nifHtfaot2[, order(alldata3$nifHAbundance)]

## Color based on potential nfixing groups
customcol13 <- c("darkblue","tomato1","firebrick","cadetblue4","dodgerblue2","forestgreen",
                 "goldenrod1","chocolate1","red","hotpink","cyan","mediumvioletred","gray")

### Figure 3a: family  
barplot(asv16s.nifHtfaot2,col=customcol13,legend.text=T,axes=F,cex.names = .3,las=2, args.legend = list(x = "topleft", bty = "n", inset=c(-0.06, -0.06), cex=0.5))


## ANCOM ####
#Detect ASVs playing significant role in driving trends for specified factor

#Load Function 'ANCOM.main' from S. Mandal (2017)####
ANCOM.main = function(OTUdat,Vardat,
                      adjusted,repeated,
                      main.var,adj.formula,
                      repeat.var,longitudinal,
                      random.formula,
                      multcorr,sig,
                      prev.cut){
  
  p.zeroes=apply(OTUdat[,-1],2,function(x){
    s=length(which(x==0))/length(x)
  })
  
  zeroes.dist=data.frame(colnames(OTUdat)[-1],p.zeroes,row.names=NULL)
  colnames(zeroes.dist)=c("Taxon","Proportion_zero")
  
  zero.plot = ggplot(zeroes.dist, aes(x=Proportion_zero)) + 
    geom_histogram(binwidth=0.1,colour="black",fill="white") + 
    xlab("Proportion of zeroes") + ylab("Number of taxa") +
    theme_bw()
  
  #print(zero.plot)
  
  OTUdat.thinned=OTUdat
  OTUdat.thinned=OTUdat.thinned[,c(1,1+which(p.zeroes<prev.cut))]
  
  asv.names=colnames(OTUdat.thinned)[-1]
  
  W.detected   <- ancom.W(OTUdat.thinned,Vardat,
                          adjusted,repeated,
                          main.var,adj.formula,
                          repeat.var,longitudinal,random.formula,
                          multcorr,sig)
  
  W_stat       <- W.detected
  
  
  ### Bubble plot
  
  W_frame = data.frame(asv.names,W_stat,row.names=NULL)
  W_frame = W_frame[order(-W_frame$W_stat),]
  
  W_frame$detected_0.9=rep(FALSE,dim(W_frame)[1])
  W_frame$detected_0.8=rep(FALSE,dim(W_frame)[1])
  W_frame$detected_0.7=rep(FALSE,dim(W_frame)[1])
  W_frame$detected_0.6=rep(FALSE,dim(W_frame)[1])
  
  W_frame$detected_0.9[which(W_frame$W_stat>0.9*(dim(OTUdat.thinned[,-1])[2]-1))]=TRUE
  W_frame$detected_0.8[which(W_frame$W_stat>0.8*(dim(OTUdat.thinned[,-1])[2]-1))]=TRUE
  W_frame$detected_0.7[which(W_frame$W_stat>0.7*(dim(OTUdat.thinned[,-1])[2]-1))]=TRUE
  W_frame$detected_0.6[which(W_frame$W_stat>0.6*(dim(OTUdat.thinned[,-1])[2]-1))]=TRUE
  
  final_results=list(W_frame,zero.plot)
  names(final_results)=c("W.taxa","PLot.zeroes")
  return(final_results)
}

#Load Function 'ANCOM.w' from S. Mandal (2017) ####
ancom.W = function(otu_data,var_data,
                   adjusted,repeated,
                   main.var,adj.formula,
                   repeat.var,long,rand.formula,
                   multcorr,sig){
  
  n_otu=dim(otu_data)[2]-1
  
  otu_ids=colnames(otu_data)[-1]
  
  if(repeated==F){
    data_comp=data.frame(merge(otu_data,var_data[,c("Sample.ID",main.var)],by="Sample.ID",row.names=NULL), check.names = F)
    #data_comp=data.frame(merge(otu_data,var_data[,c("Sample.ID",main.var)],by="Sample.ID",all.y=T),row.names=NULL)
  }else if(repeated==T){
    data_comp=data.frame(merge(otu_data,var_data,by=otu_data$Sample.ID),row.names=NULL, check.names=F)
    # data_comp=data.frame(merge(otu_data,var_data[,c("Sample.ID",main.var,repeat.var)],by="Sample.ID"),row.names=NULL)
  }
  
  base.formula = paste0("lr ~ ",main.var)
  if(repeated==T){
    repeat.formula = paste0(base.formula," | ", repeat.var)
  }
  if(adjusted==T){
    adjusted.formula = paste0(base.formula," + ", adj.formula)
  }
  
  if( adjusted == F & repeated == F ){
    fformula  <- formula(base.formula)
  } else if( adjusted == F & repeated == T & long == T ){
    fformula  <- formula(base.formula)   
  }else if( adjusted == F & repeated == T & long == F ){
    fformula  <- formula(repeat.formula)   
  }else if( adjusted == T & repeated == F  ){
    fformula  <- formula(adjusted.formula)   
  }else if( adjusted == T & repeated == T  ){
    fformula  <- formula(adjusted.formula)   
  }else{
    stop("Problem with data. Dataset should contain OTU abundances, groups, 
         and optionally an ID for repeated measures.")
  }
  
  if( repeated==FALSE & adjusted == FALSE){
    if( length(unique(data_comp[,which(colnames(data_comp)==main.var)]))==2 ){
      tfun <- exactRankTests::wilcox.exact
    } else{
      tfun <- stats::kruskal.test
    }
  }else if( repeated==FALSE & adjusted == TRUE){
    tfun <- stats::aov
  }else if( repeated== TRUE & adjusted == FALSE & long == FALSE){
    tfun <- stats::friedman.test
  }else if( repeated== TRUE & adjusted == FALSE & long == TRUE){
    tfun <- nlme::lme
  }else if( repeated== TRUE & adjusted == TRUE){
    tfun <- nlme::lme
  }
  
  logratio.mat <- matrix(NA, nrow=n_otu, ncol=n_otu)
  for(ii in 1:(n_otu-1)){
    for(jj in (ii+1):n_otu){
      data.pair <- data_comp[,which(colnames(data_comp)%in%otu_ids[c(ii,jj)])]
      lr <- log((1+as.numeric(data.pair[,1]))/(1+as.numeric(data.pair[,2])))
      
      datalr_dat <- data.frame( lr=lr, data_comp,row.names=NULL )
      
      if(adjusted==FALSE&repeated==FALSE){  ## Wilcox, Kruskal Wallis
        logratio.mat[ii,jj] <- tfun( formula=fformula, data = datalr_dat)$p.value
      }else if(adjusted==FALSE&repeated==TRUE&long==FALSE){ ## Friedman's 
        logratio.mat[ii,jj] <- tfun( formula=fformula, data = datalr_dat)$p.value
      }else if(adjusted==TRUE&repeated==FALSE){ ## ANOVA
        model=tfun(formula=fformula, data = datalr_dat,na.action=na.omit)   
        picker=which(gsub(" ","",row.names(summary(model)[[1]]))==main.var)  
        logratio.mat[ii,jj] <- summary(model)[[1]][["Pr(>F)"]][picker]
      }else if(repeated==TRUE&long==TRUE){ ## GEE
        model=tfun(fixed=fformula,data = datalr_dat,
                   random = formula(rand.formula),
                   correlation=corAR1(),
                   na.action=na.omit)   
        picker=which(gsub(" ","",row.names(anova(model)))==main.var)
        logratio.mat[ii,jj] <- anova(model)[["p-value"]][picker]
      }
      
    }
  } 
  
  ind <- lower.tri(logratio.mat)
  logratio.mat[ind] <- t(logratio.mat)[ind]
  
  
  logratio.mat[which(is.finite(logratio.mat)==FALSE)] <- 1
  
  mc.pval <- t(apply(logratio.mat,1,function(x){
    s <- p.adjust(x, method = "BH")
    return(s)
  }))
  
  a <- logratio.mat[upper.tri(logratio.mat,diag=FALSE)==TRUE]
  
  b <- matrix(0,ncol=n_otu,nrow=n_otu)
  b[upper.tri(b)==T] <- p.adjust(a, method = "BH")
  diag(b)  <- NA
  ind.1    <- lower.tri(b)
  b[ind.1] <- t(b)[ind.1]
  
  
  ### Code to extract surrogate p-value ####
  surr.pval <- apply(mc.pval,1,function(x){
    s0=quantile(x[which(as.numeric(as.character(x))<sig)],0.95)
    # s0=max(x[which(as.numeric(as.character(x))<alpha)])
    return(s0)
  })
  
  ### Conservative
  if(multcorr==1){
    W <- apply(b,1,function(x){
      subp <- length(which(x<sig))
    })
    ### Moderate
  } else if(multcorr==2){
    W <- apply(mc.pval,1,function(x){
      subp <- length(which(x<sig))
    })
    ### No correction
  } else if(multcorr==3){
    W <- apply(logratio.mat,1,function(x){
      subp <- length(which(x<sig))
    })
  }
  
  return(W)
  }
##### End of functions ####

#ANCOM #####
#Create "Sample.ID" column all data tables
#ANCOM requires that data be formatted so that first *column* is named "Sample.ID"
#Sample IDs as row names does not count!
asv16s.nifH_sbst <- data.frame("Sample.ID" = row.names(asv16s.nifH.r), asv16s.nifH.r, check.names = F)
colnames(alldata3)[colnames(alldata3) == "Sample"] <- "Sample.ID"

#Check and make sure tables align
alldata3$Sample.ID == asv16s.nifH_sbst$Sample.ID

## Remove samples with NAs in the PCRSuccess variable
alldata3.pcr <- alldata3[!is.na(alldata3$PCRSuccess),]
asv16s.nifH_sbst <- subset(asv16s.nifH_sbst,asv16s.nifH_sbst$Sample.ID %in% alldata3.pcr$Sample.ID)

#Run ANCOM, specify variable
ANCOM.16s <- ANCOM.main(asv16s.nifH_sbst,alldata3.pcr,F,F,"PCRSuccess",NULL,NULL,F,NULL,2,.05,.9)

#Look at the results
head(ANCOM.16s$W.taxa, 30)

#Create objects of significant ASVs
sigASVs_16s <- subset(ANCOM.16s$W.taxa, ANCOM.16s$W.taxa$W_stat > 0)[,1]
### Multiple significant ASVs detected for PCR success

#Subset data by sig ASVs
asv16s.nifH_sigsbst <-  t(subset(t(asv16s.nifH_sbst), colnames(asv16s.nifH_sbst) %in% sigASVs_16s))

#Organize top 5 in "yes" category for plotting
nifHplot <- data.frame(asv16s.nifH_sigsbst[,c(7,14,3,10,8)], "PCRSuccess" = alldata3.pcr$PCRSuccess, check.names = FALSE)
nifHplot[,1:5] <- lapply(nifHplot[,1:5], function(x) as.numeric(as.character(x)))
nifHplot[,1:5] <- log(nifHplot[,1:5]+1)

nifHplot.sub <- data.frame(sample=rownames(nifHplot),nifHplot, check.names = F)
nifHplot.sublong <- melt(nifHplot.sub)
names(nifHplot.sublong)[3] <- c("ASVs")

### Figure 3b: Top 5 ANCOM results from Yes category, ordered by largest effect 
gg.nifplot <- ggplot(nifHplot.sublong, aes(y = value, x = PCRSuccess, color=ASVs))+
  geom_boxplot(outlier.shape = NA, show.legend = F) + 
  geom_point(position=position_dodge(width=0.75), aes(group=ASVs, shape = ASVs), alpha =.6) +
  ylab("Log  rel. abundance") + xlab("nifH PCR Success") + theme_classic() 
gg.nifplot

# create an appropriate viewport.  Modify the dimensions and coordinates as needed
vp.BottomRight <- viewport(height=unit(.5, "npc"), width=unit(0.75, "npc"), 
                           just=c("left","top"), 
                           y=0.5, x=0.25)
# plot your base graphics 
layout(matrix(c(1,1,2,3), 2, 2, byrow = TRUE),
       widths=c(1,3), heights=c(1,1))
barplot(asv16s.nifHtfaot2,col=customcol13,legend.text=T,axes=F,cex.names = .5,border=NA,space=-0.1,las=2, args.legend = list(x = "topleft", bty = "n", cex=0.5,border=NA, inset=c(-.05, 0)))

# plot the ggplot using the print command
print(gg.nifplot2, vp=vp.BottomRight) ## Final plot had legends moved in Inkscape


